The latest version on different platforms can be installed following instructions at http://bioconductor.org/install/#install-R.
Install packages:
source("http://bioconductor.org/biocLite.R")
biocLite(c("supraHex","devtools","XGR","dplyr"))
devtools::install_github(c("hfang-bristol/supraHex", "hfang-bristol/XGR"))
A collection of aesthetic maps supported for the self-organised representation and comparison: (from top-left to bottom-right) a supra-hexagonal map, a ring map, a diamond map, a trefoil map, a butterfly map, an hourglass map, a ladder map, a bridge map, and a triangle map.
library(supraHex)
library(ggplot2)
colormap <- 'lightyellow-lightblue'
color <- 'black'
# For "supraHex" shape itself
sHex <- sHexGridVariant(r=6, shape="suprahex")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_suprahex <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "triangle" shape
sHex <- sHexGridVariant(r=6, shape="triangle")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_triangle <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "diamond" shape
sHex <- sHexGridVariant(r=6, shape="diamond")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_diamond <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "hourglass" shape
sHex <- sHexGridVariant(r=6, shape="hourglass")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_hourglass <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "trefoil" shape
sHex <- sHexGridVariant(r=6, shape="trefoil")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_trefoil <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ladder" shape
sHex <- sHexGridVariant(r=6, shape="ladder")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ladder <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "butterfly" shape
sHex <- sHexGridVariant(r=6, shape="butterfly")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_butterfly <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ring" shape
sHex <- sHexGridVariant(r=6, shape="ring")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ring <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "bridge" shape
sHex <- sHexGridVariant(r=6, shape="bridge")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_bridge <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
gridExtra::grid.arrange(grobs=list(gp_suprahex,gp_ring,gp_diamond,gp_trefoil,gp_butterfly,gp_hourglass,gp_ladder,gp_bridge,gp_triangle), layout_matrix=rbind(c(1,1,2,2,3),c(1,1,2,2,3),c(4,4,5,5,6),c(4,4,5,5,6),c(7,7,8,8,9)), nrow=5, ncol=5)
In this showcase, we analyse TF and DHS datasets in K562, and compare TF datasets between K562 and Gm12878.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and DHS datasets:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to Gm12878 and K562
anno_gr_Gm12878 <- ENCODE_TFBS_ClusteredV3_CellTypes$GM12878
anno_gr_K562 <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
names_common <- sort(intersect(names(anno_gr_Gm12878), names(anno_gr_K562)))
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878 <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562 <- anno_gr_K562[ind]
# extract DHS, stored in a list of GR objects
ENCODE_DNaseI_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_DNaseI_ClusteredV3_CellTypes', RData.location=RData.location)
## in K562
ind <- grep("K562", names(ENCODE_DNaseI_ClusteredV3_CellTypes))
DHS_gr_K562 <- ENCODE_DNaseI_ClusteredV3_CellTypes[ind]
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Justify the use of the bridge-shaped map:
data <- G2TF_K562
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=48) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the bridge-shaped map using cross-TF gene scores:
data <- G2TF_K562
sMap <- sPipeline(data, shape="bridge", finetuneSustain=T)
Build neighbour-joining TF tree:
D <- t(sMap$codebook)
set.seed(825)
tree_bs <- visTreeBootstrap(D, num.bootstrap=1000, nodelabels.arg=list(frame="circle",cex=0.35,col="black"), plot.phylo.arg=list(type=c("phylogram","cladogram","fan","radial")[1],cex=0.6,node.pos=1))
Visualise TF map (reordered by tree):
tree_bs_ind <- match(tree_bs$tip.label, rownames(D))
xMap <- sMap
xMap$codebook <- xMap$codebook[,tree_bs_ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=51:1, rect.grid=c(7,8), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Calculate gene-centric DHS association scores:
anno_gr <- DHS_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=0, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, Cell=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2DHS <- as.matrix(xSparseMatrix(df[,c("Gene","Cell","Score")]))
The DHS map is obtained by overlaying the DHS data onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2DHS))
additional <- G2DHS[ind,]
additional[is.na(additional)] <- 0
additional <- matrix(additional, ncol=1)
rownames(additional) <- rownames(G2TF_K562)
colnames(additional) <- 'DHS'
sOverlay_DHS <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Visualise DHS map:
colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_DHS, rect.grid=c(1,1), title.rotate=0, colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=1), newpage=F)
Rankings of 51 TFs in terms of similarity (Pearson correlation) to the DHS map in K562:
M_K562 <- sMap$codebook
M_DHS <- sOverlay_DHS$codebook
y <- M_DHS[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_DHS <- xPolarDot(df, parallel=T, signature=F)
gp_DHS + labs(y="Pearson correlation") + ylim(0,1)
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_Gm12878
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
The TF map is obtained by overlaying the TF data in Gm12878 onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2TF_Gm12878))
additional <- G2TF_Gm12878[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878 <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Rankings of 51 TFs in terms of similarity (Pearson correlation) in two cell lines (K562 and Gm12878):
M_K562 <- sMap$codebook
M_Gm12878 <- sOverlay_Gm12878$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_twin <- xPolarDot(df, parallel=T, signature=F)
gp_twin + labs(y="Pearson correlation") + ylim(0,1)
Visualise TF map (reordered by correlation):
## yMap
yMap <- sMap
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_Gm12878
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## xMap
xMap <- sMap
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*2)
matrix.combined[, seq(1,nc*2,2)] <- yMap$codebook
matrix.combined[, seq(2,nc*2,2)] <- zMap$codebook
xMap$codebook <- matrix.combined
colnames(xMap$codebook) <- rep(colnames(yMap$codebook), each=2)
## visualisation
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=1:(nc*2), rect.grid=c(13,8), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.4), newpage=F)
Perform enrichment analysis for TFs:
## TFs (high correlation)
data <- subset(df, rank<=25)$TF
eTerm_high <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## TFs (low correlation)
data <- subset(df, rank>=27)$TF
eTerm_low <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## visualisation
list_eTerm <- list(eTerm_low, eTerm_high)
names(list_eTerm) <- c('TFs (low correlation)', 'TFs (high correlation)')
bp <- xEnrichCompare(list_eTerm, displayBy="fc", bar.label.size=2.5, signature=F)
bp + theme(axis.text.y=element_text(size=10))
Enriched domains for TFs (high correlation):
xEnrichViewer(eTerm_high, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 46785 "Winged helix" DNA-binding domain 212 6 17.40 9.72
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.30 6.61
> 57667 beta-beta-alpha zinc fingers 737 6 5.01 4.52
> pvalue adjp distance members
> 46785 2.8e-08 8.3e-08 a.4.5 E2F4, ELF1, ELK1, ETS1, RAD21, SPI1
> 47459 3.0e-05 4.4e-05 a.38.1 BHLHE40, USF1, USF2
> 57667 1.1e-04 1.1e-04 g.37.1 CTCF, EGR1, MAZ, YY1, ZBTB33, ZNF143
Enriched domains for TFs (low correlation):
xEnrichViewer(eTerm_low, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 57959 Leucine zipper domain 52 5 59.2 17.00
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.3 6.61
> 57667 beta-beta-alpha zinc fingers 737 3 2.5 1.70
> pvalue adjp distance members
> 57959 1.6e-10 4.9e-10 h.1.3 ATF3, CEBPB, FOS, JUND, NFE2
> 47459 3.0e-05 4.4e-05 a.38.1 MAX, MXI1, MYC
> 57667 2.9e-02 2.9e-02 g.37.1 REST, SP1, ZNF274
In this showcase, we analyse TF datasets in K562, Gm12878 and H1hESC.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF datasets shared by K562, Gm12878 and H1hESC:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to H1-hESC, Gm12878 and K562
anno_gr_H1hESC <- ENCODE_TFBS_ClusteredV3_CellTypes$'H1-hESC'
names_common <- sort(base::Reduce(intersect, list(names(anno_gr_Gm12878), names(anno_gr_K562), names(anno_gr_H1hESC))))
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562_tri <- anno_gr_K562[ind]
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878_tri <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_H1hESC))
TF_gr_H1hESC_tri <- anno_gr_H1hESC[ind]
Calculate gene-centric TF association scores:
## G2TF_K562_tri
anno_gr <- TF_gr_K562_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_Gm12878_tri
anno_gr <- TF_gr_Gm12878_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_H1hESC_tri
anno_gr <- TF_gr_H1hESC_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_H1hESC_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Train the bridge-shaped map using cross-TF gene scores in K562:
data <- G2TF_K562_tri
sMap_tri <- sPipeline(data, shape="bridge", finetuneSustain=T)
TF map in Gm12878 (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_Gm12878_tri))
additional <- G2TF_Gm12878_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
TF map in H1hESC (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_H1hESC_tri))
additional <- G2TF_H1hESC_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_H1hESC_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
Correlation analysis of TF’s similarity between Gm12878 and H1hESC in terms of K562:
M_K562 <- sMap_tri$codebook
M_Gm12878 <- sOverlay_Gm12878_tri$codebook
M_H1hESC <- sOverlay_H1hESC_tri$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
## Gm12878
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary1 <- ls_res$df_summary
## H1hESC
y <- M_H1hESC[,i]
names(y) <- 1:nrow(M_H1hESC)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary2 <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], Gm12878=df_summary1$cor, H1hESC=df_summary2$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
## correlation plot
x <- df[,c('TF','Gm12878')]
y <- df$H1hESC
names(y) <- df$TF
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal", plot=T)
gp_tri <- ls_res$ls_gp[[1]]
gp_tri <- gp_tri + labs(x="Correlation between Gm12878 and K562", y="Correlation between H1hESC and K562") + xlim(0.5,1) + ylim(0.5,1)
gp_tri <- gp_tri + geom_abline(intercept=0, slope=1, color="gray50") + coord_fixed(ratio=1)
gp_tri <- gp_tri + ggrepel::geom_text_repel(data=gp_tri$data, aes(label=name), size=2, fontface='bold', box.padding=unit(0.2,"lines"), point.padding=unit(0.2,"lines"), segment.color='grey50', segment.alpha=0.5, arrow=arrow(length=unit(0.01,'npc')), force=0.1)
gp_tri
Visualise TF map (reordered by correlation difference):
## order by the difference
df$diff <- df$H1hESC - df$Gm12878
df <- df %>% dplyr::arrange(diff)
## xMap
xMap <- sMap_tri
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
## yMap
yMap <- sOverlay_Gm12878_tri
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_H1hESC_tri
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## aMap
aMap <- sMap_tri
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*3)
### ordered by H1hESC Gm12878 K562
matrix.combined[, seq(1,nc*3,3)] <- zMap$codebook
matrix.combined[, seq(2,nc*3,3)] <- yMap$codebook
matrix.combined[, seq(3,nc*3,3)] <- xMap$codebook
aMap$codebook <- matrix.combined
colnames(aMap$codebook) <- rep(colnames((xMap$codebook)), each=3)
## visualisation
colormap <- xColormap("jet.top")
### reverse order by K562 Gm12878 H1hESC
visHexMulComp(aMap, which.components=(nc*3):1, rect.grid=c(9,9), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.7), newpage=F)
In this showcase, we analyse TF datasets in K562 and expression datasets in chronic myeloid leukemia (CML, subdivided into blast crisis (BC), chronic phase (CP) and accelerated phase (AP)). Note, the K562 cell line was derived from a CML patient in blast crisis.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF datasets in K562 and expression datasets in CML:
# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
# import CML expression
data.file <- file.path(RData.location,'GSE4170.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, gene!='')
## take average per gene
data <- as.data.frame(data %>% dplyr::group_by(gene) %>% dplyr::summarise(CP=mean(CP), AP=mean(AP), BC=mean(BC)))
# extract genes commons to expression and TF datasets in K562
ind <- match(data$gene, rownames(mat))
G2TF_K562_expression <- mat[ind[!is.na(ind)],]
G2EXP_CML <- data.frame(data[!is.na(ind), c('CP','AP','BC')], stringsAsFactors=F)
rownames(G2EXP_CML) <- data$gene[!is.na(ind)]
Justify the use of the butterfly-shaped map:
Note: both supra-hexagonal and butterfly shapes are valid, and we choose the butterfly one as there exist two major clusters of data points.
data <- G2TF_K562_expression
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=30) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the diamond-shaped map using cross-TF gene scores:
data <- G2TF_K562_expression
sMe <- sPipeline(data, shape="butterfly", finetuneSustain=T, scale=3)
The expression map is obtained by overlaying the CML expression data onto the trained TF map in K562:
sOverlay_EXP <- sMapOverlay(sMe, data=G2TF_K562_expression, additional=G2EXP_CML)
Visualise expression map:
colormap <- xColormap("jet.both")
visHexMulComp(sOverlay_EXP, rect.grid=c(1,3), title.rotate=0, title.xy=c(0.45,1), colormap=colormap, zlim=c(-0.4,0.4), gp=grid::gpar(cex=1), newpage=F)
Rankings of 100 TFs in terms of similarity (Pearson correlation) to the expression map in CML blast crisis phase (BC):
M_K562 <- sMe$codebook
M_EXP <- sOverlay_EXP$codebook
y <- M_EXP[,'BC']
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_EXP <- xPolarDot(df, colormap='blue-yellow-red', size=1.5, parallel=T, signature=F)
gp_EXP + labs(y="Pearson correlation") + scale_y_continuous(limits=c(-1,1))
Visualise TF map (reordered by correlation):
xMap <- sMe
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=1:100, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Perform enrichment analysis for TFs:
## TFs (high correlation)
data <- subset(df, rank<=20)$TF
background <- NULL
eT_high <- xEnricherGenes(data=data, background=background, ontology="REACTOME", size.range=c(10,2000), test="fisher", min.overlap=5, ontology.algorithm="none", RData.location=RData.location)
## TFs (low correlation)
data <- subset(df, rank>=81)$TF
eT_low <- xEnricherGenes(data=data, background=background, ontology="REACTOME", size.range=c(10,2000), test="fisher", min.overlap=5, ontology.algorithm="none", RData.location=RData.location)
## visualisation
list_eTerm <- list(eT_low, eT_high)
names(list_eTerm) <- c('TFs (low correlation)', 'TFs (high correlation)')
bp <- xEnrichCompare(list_eTerm, displayBy="fc", bar.label.size=2.5, signature=F)
bp + theme(axis.text.y=element_text(size=10))
Enriched domains for TFs (high correlation):
xEnrichViewer(eT_high, top=NULL, details=T)
> name nAnno nOverlap fc
> R-HSA-212436 Generic Transcription Pathway 1063 11 6.05
> R-HSA-74160 Gene expression (Transcription) 1307 12 5.37
> R-HSA-73857 RNA Polymerase II Transcription 1185 11 5.43
> R-HSA-4839726 Chromatin organization 224 5 13.00
> R-HSA-3247509 Chromatin modifying enzymes 224 5 13.00
> R-HSA-3700989 Transcriptional Regulation by TP53 361 6 9.71
> R-HSA-69278 Cell Cycle, Mitotic 524 5 5.58
> R-HSA-1640170 Cell Cycle 600 5 4.87
> R-HSA-597592 Post-translational protein modification 1347 5 2.17
> R-HSA-392499 Metabolism of proteins 1890 5 1.55
> zscore pvalue adjp distance
> R-HSA-212436 7.21 1.3e-07 6.7e-07 3
> R-HSA-74160 7.01 8.4e-08 6.7e-07 1
> R-HSA-73857 6.72 4.1e-07 1.4e-06 2
> R-HSA-4839726 7.55 2.8e-05 4.6e-05 1
> R-HSA-3247509 7.55 2.8e-05 4.6e-05 2
> R-HSA-3700989 6.98 1.9e-05 4.6e-05 4
> R-HSA-69278 4.46 1.5e-03 2.1e-03 2
> R-HSA-1640170 4.05 2.7e-03 3.3e-03 1
> R-HSA-597592 1.91 6.9e-02 7.7e-02 2
> R-HSA-392499 1.09 2.1e-01 2.1e-01 1
> members
> R-HSA-212436 E2F4, ELF1, HDAC1, KDM5B, MYC, PML, POLR2A, RBBP5, TAF1, TBP, YY1
> R-HSA-74160 E2F4, ELF1, HDAC1, KDM5B, MYC, PML, POLR2A, RBBP5, TAF1, TBP, UBTF, YY1
> R-HSA-73857 E2F4, ELF1, HDAC1, KDM5B, MYC, PML, POLR2A, RBBP5, TAF1, TBP, YY1
> R-HSA-4839726 HDAC1, KDM5B, PHF8, RBBP5, SAP30
> R-HSA-3247509 HDAC1, KDM5B, PHF8, RBBP5, SAP30
> R-HSA-3700989 E2F4, HDAC1, PML, POLR2A, TAF1, TBP
> R-HSA-69278 E2F4, HDAC1, MAX, MYC, PHF8
> R-HSA-1640170 E2F4, HDAC1, MAX, MYC, PHF8
> R-HSA-597592 HDAC1, MYC, PML, RBBP5, YY1
> R-HSA-392499 HDAC1, MYC, PML, RBBP5, YY1
Enriched domains for TFs (low correlation):
xEnrichViewer(eT_low, top=NULL, details=T)
> name nAnno nOverlap fc
> R-HSA-212436 Generic Transcription Pathway 1063 10 5.19
> R-HSA-74160 Gene expression (Transcription) 1307 11 4.65
> R-HSA-73857 RNA Polymerase II Transcription 1185 10 4.66
> R-HSA-8878171 Transcriptional regulation by RUNX1 232 5 11.90
> R-HSA-1280215 Cytokine Signaling in Immune system 898 7 4.30
> R-HSA-449147 Signaling by Interleukins 683 6 4.85
> zscore pvalue adjp distance
> R-HSA-212436 6.16 3.7e-06 1.1e-05 3
> R-HSA-74160 6.02 2.6e-06 1.1e-05 1
> R-HSA-73857 5.72 9.9e-06 2.0e-05 2
> R-HSA-8878171 7.15 4.4e-05 6.7e-05 4
> R-HSA-1280215 4.42 6.3e-04 7.5e-04 2
> R-HSA-449147 4.44 9.4e-04 9.4e-04 3
> members
> R-HSA-212436 FOS, GATA1, GATA2, JUN, JUNB, NFE2, SMARCB1, SPI1, ZNF263, ZNF274
> R-HSA-74160 EZH2, FOS, GATA1, GATA2, JUN, JUNB, NFE2, SMARCB1, SPI1, ZNF263, ZNF274
> R-HSA-73857 FOS, GATA1, GATA2, JUN, JUNB, NFE2, SMARCB1, SPI1, ZNF263, ZNF274
> R-HSA-8878171 GATA1, GATA2, NFE2, SMARCB1, SPI1
> R-HSA-1280215 EZH2, FOS, JUN, JUNB, REST, STAT2, STAT5A
> R-HSA-449147 EZH2, FOS, JUN, JUNB, REST, STAT5A
In this showcase, we analyse TF and CRISPR datasets in K562.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and CRISPR datasets in K562:
# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
# import CRISPR
data.file <- file.path(RData.location,'TW_Science_CRISPR.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, K562.adjusted.p.value<0.05)
# extract genes commons to CRISPR and TF datasets in K562
ind <- match(data$Gene, rownames(mat))
G2TF_K562_crispr <- mat[ind[!is.na(ind)],]
G2CRISPR_K562 <- data.frame(CRISPR=data$K562.CS[!is.na(ind)], stringsAsFactors=F)
rownames(G2CRISPR_K562) <- data$Gene[!is.na(ind)]
Justify the use of the triangle-shaped map:
data <- G2TF_K562_crispr
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=24) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the triangle-shaped map using cross-TF gene scores:
data <- G2TF_K562_crispr
sM <- sPipeline(data, shape="triangle", finetuneSustain=T, scale=1)
The CRISPR map is obtained by overlaying the CRISPR data onto the trained TF map in K562:
sOverlay_CRISPR <- sMapOverlay(sM, data=G2TF_K562_crispr, additional=G2CRISPR_K562)
Visualise CRISPR map:
colormap <- xColormap("jet.both")
visHexMulComp(sOverlay_CRISPR, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(-3,3), gp=grid::gpar(cex=1), newpage=F)
Rankings of 100 TFs in terms of similarity (Pearson correlation) to the CRISPR map in K562:
M_K562 <- sM$codebook
M_CRISPR <- sOverlay_CRISPR$codebook
y <- M_CRISPR[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_CRISPR <- xPolarDot(df, size=1.5, parallel=T, signature=F)
gp_CRISPR + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))
Visualise TF map (reordered by correlation):
xMap <- sM
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=100:1, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on TF map:
xM <- sM
ind <- match(df$TF, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate CRISPR scores per gene cluster:
colormap <- xColormap("jet.top")
output_CRISPR <- visDmatHeatmap(xM, data=G2TF_K562_crispr[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,1), KeyValueName="TF-gene association", labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output$ID, rownames(G2CRISPR_K562))
output$val <- G2CRISPR_K562[ind,]
ls_tmp <- split(x=output$val, f=output$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
x <- ls_tmp[[i]]
y <- median(x)
data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp
Perform enrichment analysis for clustered genes:
ls_cluster <- split(x=output_CRISPR$ID, f=output_CRISPR$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- output_CRISPR$ID
ontologies <- c("SF","GOMF","REACTOME")
ls_res <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,2000), test="fisher", min.overlap=10, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.01, displayBy="fdr")
ls_res$gp + theme(legend.title=element_text(size=8))
In this showcase, we analyse GTEx V6p tissue-specific eGene datasets and ExAC loss-of-function (LoF) datasets.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import GTEx and ExAC datasets:
# import GTEx datasets
GTEx_V6p_matrix <- xRDataLoader(RData.customised='GTEx_V6p_matrix', RData.location=RData.location)
## brain or blood tissues
ind <- grepl('Brain|Blood', colnames(GTEx_V6p_matrix))
GTEx_V6p_matrix <- GTEx_V6p_matrix[,ind]
colnames(GTEx_V6p_matrix) <- gsub('Brain_','',colnames(GTEx_V6p_matrix))
## only significant eGenes (q-value<0.05)
GTEx_V6p_matrix <- -log10(GTEx_V6p_matrix)
ind <- apply(GTEx_V6p_matrix>-log10(0.05), 1, sum)
GTEx_matrix <- GTEx_V6p_matrix[ind!=0,]
# import ExAC datasets
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
## ExAC LoF intolerance genes: binary (1 for yes, 0 for no)
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
# extract genes commons to GTEx and ExAC datasets
ind <- match(rownames(ExAC_matrix), rownames(GTEx_matrix))
G2GTEx <- GTEx_matrix[ind[!is.na(ind)],]
G2ExAC <- data.frame(pLI=ExAC_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2ExAC) <- rownames(ExAC_matrix)[!is.na(ind)]
Justify the use of the diamond-shaped map:
data <- G2GTEx
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=32) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the diamond-shaped map using GTEx eGene datasets:
data <- G2GTEx
sMg <- sPipeline(data, shape="diamond", finetuneSustain=T, scale=3)
The ExAC map is obtained by overlaying the ExAC data onto the trained GTEx map:
sOverlay_ExAC <- sMapOverlay(sMg, data=G2GTEx, additional=G2ExAC)
Visualise ExAC map:
colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_ExAC, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0,0.4), gp=grid::gpar(cex=1), newpage=F)
Rankings of 11 tissues in terms of similarity (Pearson correlation) to the ExAC map:
M_GTEx <- sMg$codebook
M_ExAC <- sOverlay_ExAC$codebook
y <- M_ExAC[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_GTEx), function(i){
x <- data.frame(name=1:nrow(M_GTEx), M_GTEx[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(tissue=colnames(M_GTEx)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar dotplot
df_tmp <- df
df_tmp$tissue <- gsub('_','\n',df_tmp$tissue)
gp_GTEx <- xPolarDot(df_tmp, size=1.5, parallel=T, signature=F)
gp_GTEx + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))
Visualise GTEx map (reordered by correlation):
xMap <- sMg
ind <- match(df$tissue, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=11:1, rect.grid=c(1,11), title.rotate=30, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,6), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on GTEx tissue map:
xM <- sMg
ind <- match(df$tissue, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate the probability of genes being LoF intolerant per gene cluster:
colormap <- xColormap("jet.top")
output_ExAC <- visDmatHeatmap(xM, data=G2GTEx[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,6), KeyValueName=expression(-log[10]("q-value")), labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output_ExAC$ID, rownames(G2ExAC))
output_ExAC$val <- G2ExAC[ind,'pLI']
ls_tmp <- split(x=output_ExAC$val, f=output_ExAC$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
x <- ls_tmp[[i]]
y <- mean(x)
data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp
Perform enrichment analysis for clustered genes:
ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- output_ExAC$ID
ontologies <- "CGL"
ls_res_CGL <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="hypergeo", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
ls_res_CGL$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
Perform evolutionary analysis for two groups of genes:
ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
# define two groups
## group 1: clusters 2-5
## group 2: clusters 1 and 6-9
ls_group <- list(group1=unique(unlist(ls_cluster[c(2,3,4,5)])), group2=unique(unlist(ls_cluster[c(1,6,7,8,9)])))
background <- NULL
ontologies <- "PS2"
ls_res_PS2 <- xEnricherGenesAdv(list_vec=ls_group, background=background, ontologies=ontologies, size.range=c(10,20000), test="hypergeo", min.overlap=5, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=1, displayBy="fdr")
ls_res_PS2$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
In this showcase, we analyse haploid genetic screen datasets together with expression and ExAC loss-of-function (LoF) datasets.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import Haploid and define combined datasets (expression and LoF):
# import haploid datasets (only positive regulators with negative mutation index)
Haploid <- xRDataLoader(RData.customised='Haploid_regulators', RData.location=RData.location)
## all phenotypes except 'HMGCR'
ind <- grepl('HMGCR', Haploid$Phenotype)
df <- Haploid[!ind,]
df <- df[df$MI<0,]
Haploid_matrix <- as.matrix(xSparseMatrix(df[,c('Gene','Phenotype','MI')]))
# number of phenotypes per gene
vec_count <- apply(Haploid_matrix!=0, 1, sum)
# median expression
EXP <- xRDataLoader(RData.customised='Haploid_expression', RData.location=RData.location)
vec_expression <- apply(EXP, 1, median)
# ExAC LoF intolerant
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
vec_lof <- ExAC_matrix[,1]
names(vec_lof) <- rownames(ExAC_matrix)
# combine three above
mat <- matrix(0, nrow=nrow(Haploid_matrix), ncol=3)
rownames(mat) <- rownames(Haploid_matrix)
colnames(mat) <- c('Count','Expression','LoF')
ind <- match(rownames(mat), names(vec_count))
mat[!is.na(ind),1] <- vec_count[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_expression))
mat[!is.na(ind),2] <- vec_expression[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_lof))
mat[!is.na(ind),3] <- vec_lof[ind[!is.na(ind)]]
Combined_matrix <- mat
# extract genes commons to Haploid and Combined datasets
ind <- match(rownames(Combined_matrix), rownames(Haploid_matrix))
G2Haploid <- Haploid_matrix[ind[!is.na(ind)],]
G2Combined <- data.frame(Combined_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2Combined) <- rownames(Combined_matrix)[!is.na(ind)]
Train the trefoil-shaped map using Haploid genetic regulator datasets:
sMh <- sPipeline(data=G2Haploid, shape="trefoil", finetuneSustain=T, scale=3)
The map is obtained by overlaying the combined datasets onto the trained Haploid protein map:
sO_Combined <- sMapOverlay(sMh, data=G2Haploid, additional=G2Combined)
Visualise overlaid count map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=1, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(1,5), gp=grid::gpar(cex=1), newpage=F)
Visualise overlaid expression map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=2, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(6,10), gp=grid::gpar(cex=1), newpage=F)
Visualise overlaid LoF map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=3, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0.3,0.7), gp=grid::gpar(cex=1), newpage=F)
Visualise Haploid protein map in 2D landscape:
sReorder <- sCompReorder(sMh, metric="euclidean")
colormap <- colorRampPalette(c("blue","#007FFF","cyan"), interpolate="spline")
visCompReorder(sMh, sReorder, title.rotate=0, title.xy=c(0.4,0.95), colormap=colormap, zlim=c(-1,0), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on Haploid protein map:
xM <- sMh
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate the overlaid average per gene cluster:
colormap <- xColormap("jet.both")
output_Combined <- visDmatHeatmap(xM, data=G2Haploid, sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(-1,1), KeyValueName=expression(-log[2]("Mutation index")), labRow=NA, reorderRow="none", srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output_Combined$ID, rownames(G2Combined))
df_tmp <- cbind(output_Combined, G2Combined[ind,])
df <- as.data.frame(df_tmp %>% dplyr::mutate(Cluster=paste0('C',Cluster_base)) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(Count=mean(Count), Expression=mean(Expression),LoF=mean(LoF)))
ls_gp <- lapply(2:ncol(df), function(i){
gp <- xPolarBar(df[,c(1,i)], colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp + labs(title=colnames(df)[i])
})
gridExtra::grid.arrange(grobs=ls_gp, nrow=1, ncol=3)
Perform enrichment analysis for clustered genes:
ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "REACTOME"
ls_res_reactome <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(20,500), test="hypergeo", min.overlap=10, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.01, displayBy="zscore")
#ls_res_reactome$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
df <- ls_res_reactome$df
df <- as.data.frame(df %>% dplyr::filter(adjp < 0.01))
mat <- as.matrix(xSparseMatrix(df[,c('name','group','zscore')], columns=names(ls_cluster)))
mat[mat==0] <- NA
gp <- xHeatmap(t(mat), reorder=c("none","row","col","both")[3], colormap="#E6F598-#ABDDA4-#66C2A5-#3288BD", zlim=c(2,10), na.color='transparent', size=2.5, x.rotate=30, x.text.size=5, y.text.size=6, barheight=5, legend.title='Z-score')
gp